BindingSiteSet.txt

Releases

Comparison of the BindingSiteSet.txt file between the last releases.

Release version Date
10.6 2019 July
10.6.3 -
10.7 2020 April
10.8 2020 October
10.9 2021 April
10.10 2022 February
11.0 2022 August
11.0.1 (not public) 2022 September
11.0.2 (not public) 2022 September

Notes:

  • V10.6 excluded because it doesn’t include TU_ID, only TU names which are not unique; otherwise it’s almost the same as v10.7 which is included
  • Some formatting is done in order to make columns uniform across versions (uppercase/lowercase in confidence levels, word or symbol +/- for strand and effect, etc.)
  • Starting v11.0.1 there are 2 evidence columns (site evidence and function evidence), here I merged them so I can compare with older versions

Comments on evidence and confidence issues:

  • Version 10.6 did not include confirmed TFBSs (error in the query?); fixed by Cesar in version 10.7
  • Version 10.7 seems right
  • Version 10.8 seems to have too many confirmed TFBSs: erroneous cross-validations CV(SM) and CV(GEA) were added to all TFBSs, sometimes even duplicated
  • Version 10.9 loses most evidence information: ?
  • Version 10.10 and 11.0 have mostly weak sites, and many NAs: only function evidence is taken into account (not site)
  • Version 11.0.1 does not have confirmed TFBSs: only site evidence is taken into account (not function)
  • Version 11.0.2 seems about right? congruent with version 10.7
## V10.6 excluded because it doesn't include TU_ID, only TU names which are not unique; otherwise it's almost the same as v10.7 which is included
## Some formatting is done in order to uniformize columns like strand and confidence (uppercase/lowercase, word or symbol +- for the strand, etc)
## Starting v11.0.1 there are 2 evidence columns, here I just merge them so I can compare with older versions

## Releases of BindingSiteSet to load, oldest first.
dir_versions <- c("10.7", "10.8", "10.9", "10.10", "11.0", "11.0.1", "11.0.2")

## Read every release into a common layout. Each table is stored three ways:
## as the variable tfbs_set_<tag>, as the element tfbs_sets[[<tag>]], and its
## tag ("v" + release) is appended to tfbs_versions in the order of dir_versions.
tfbs_sets <- list()
tfbs_versions <- c()
for (v in dir_versions) {
  version_tag <- paste0("v", v)
  set <- read.delim(paste0(dir_releases, "/", v, "/BindingSiteSet.tsv"),
                    comment.char = "#", header = TRUE, stringsAsFactors = FALSE,
                    na.strings = c("", "NA")) %>%
    dplyr::mutate(version = version_tag) %>%
    # Recode strand to "+"/"-". Releases that already use the symbols are kept
    # as they are (they used to be turned into NA); anything else becomes NA.
    dplyr::mutate(strand = ifelse(strand %in% c("reverse", "-"), "-",
                                  ifelse(strand %in% c("forward", "+"), "+", NA))) %>%
    dplyr::mutate(confidence = tolower(confidence)) %>%
    # Row-wise from here on: the evidence merge below is evaluated one row at a
    # time. The table is left row-wise (no ungroup afterwards).
    dplyr::rowwise() %>%
    # When the release has an evidence_function column it is combined with
    # evidence through concat_uniq2 (project helper defined elsewhere;
    # presumably joins the unique values -- confirm). Otherwise evidence is
    # left untouched.
    dplyr::mutate(evidence = ifelse("evidence_function" %in% colnames(.),
                                    concat_uniq2(evidence, evidence_function),
                                    evidence)) %>%
    # Key used later to compare binding-site positions between releases.
    dplyr::mutate(coords = paste0(start, "_", stop))

  assign(paste0("tfbs_set_", version_tag), set)
  tfbs_sets[[version_tag]] <- set
  tfbs_versions <- c(tfbs_versions, version_tag)
}
## Stack all releases into one table and fix the ordering of the categorical
## columns: version follows the load order, effect and confidence use the
## explicit level lists below (values outside those lists become NA).
all_tfbs <- dplyr::mutate(
  bind_rows(tfbs_sets),
  version = factor(version, levels = tfbs_versions),
  effect = factor(effect, levels = c("+", "-", "?")),
  confidence = factor(confidence, levels = c("weak", "strong", "confirmed"))
)

## One row per (TFBS entry, evidence item). The evidence column is split on
## "," into separate rows, square brackets are removed, and each item is then
## split on "|" into code, level and name.
all_tfbs_by_evidence <- all_tfbs %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  # trimws() removes the blanks left around each item after the "," split, so
  # that e.g. " CE" and "CE" are not counted as two different evidence codes
  # (same cleaning as the network_tf_gene evidence table).
  dplyr::mutate(evidence = trimws(gsub("\\[|\\]", "", evidence))) %>%
  tidyr::separate(evidence,
                  c("evidence_code", "evidence_level", "evidence_name"),
                  sep = "\\|")

Overall number of TFBSs

## Number of BindingSiteSet rows in each release, ordered by release.
tfbs_per_version <- dplyr::group_by(all_tfbs, version)
tfbs_summary <- dplyr::arrange(
  dplyr::summarise(tfbs_per_version, total = n()),
  version
)

## Bar chart of those totals (simple_bar is a project helper defined elsewhere).
TFBS_num <- simple_bar(tfbs_summary, "version", "total") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of TFBSs", title = "")

TFBS_num

## Same totals as a static table (no search box, no page-length selector).
DT::datatable(
  tfbs_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Comments:

  • Sudden increase with 488 new entries in version 10.9

TFBS IDs

TFBS IDs shared between versions

## Unique TFBS IDs of every release, as a list named by version tag.
tfbs_ids <- setNames(
  lapply(tfbs_versions, function(tag) unique(get(paste0("tfbs_set_", tag))$TFBS_ID)),
  tfbs_versions
)

## UpSet plot of the ID sets; three intersections are highlighted
## (two in red, one in blue).
UpSetR::upset(
  fromList(tfbs_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  queries = list(
    list(query = intersects, params = list("v10.7"),
         color = "red", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8"),
         color = "red", active = TRUE),
    list(query = intersects,
         params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE)
  )
)

Comments:

  • The oldest versions have fewer entries but more unique TFBS IDs
  • 421 TFBS IDs disappear from version 10.8 until now (red bar)
  • There are only 21 new IDs present in version 10.9 to 11.0.2 despite the sudden increase in total entries (blue bar)

Duplicated TFBS IDs

## For every release: how many TFBS IDs occur once, twice, three times, ...
## Step 1 counts rows per (version, TFBS_ID); step 2 counts IDs per
## (version, number of occurrences).
tfbs_ids_dupli <- all_tfbs %>%
  dplyr::group_by(version, TFBS_ID) %>%
  dplyr::summarise(occurrences = n()) %>%
  group_by(version, occurrences) %>%
  summarise(tfbs_number = n()) %>%
  # Drop the remaining grouping by version before building the factor:
  # otherwise factor() runs once per release and the combined level order
  # follows first appearance instead of numeric order (1, 2, 3, ...), which
  # scrambles the legend and the colour assignment.
  dplyr::ungroup() %>%
  dplyr::mutate(occurrences = factor(occurrences))

## All occurrence classes, bars side by side within each release.
dodge <- ggplot(tfbs_ids_dupli, aes(fill = occurrences, y = tfbs_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14)) +
  labs(x = "", y = "Number of unique TFBS IDs",
       title = "TFBS ID duplication in BindingSiteSet.txt across versions")

## Same plot restricted to IDs occurring 5 times or more; drop = FALSE keeps
## the colours aligned with the plot above.
dodge2 <- ggplot(tfbs_ids_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
                 aes(fill = occurrences, y = tfbs_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "none") +
  labs(x = "", y = "Number of unique TFBS IDs", title = "...minimum 5 copies")

## Stack the two plots vertically.
dodge / dodge2

Which TFBS IDs are most duplicated?

## TFBS IDs with 10 or more rows in at least one release. The result is wide:
## one row per TFBS ID, one column per release holding the row count.
tfbs_ids_dupli_max <- all_tfbs %>%
  dplyr::group_by(version, TFBS_ID) %>%
  dplyr::summarise(occurrences = n()) %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(desc(occurrences)) %>%
  pivot_wider(names_from = version, values_from = occurrences)

DT::datatable(
  tfbs_ids_dupli_max,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Coordinates

## Unique "start_stop" coordinate keys of every release, named by version tag.
coordinates <- setNames(
  lapply(tfbs_versions, function(tag) unique(get(paste0("tfbs_set_", tag))$coords)),
  tfbs_versions
)

## UpSet plot of the coordinate sets with two highlighted intersections.
UpSetR::upset(
  fromList(coordinates),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  queries = list(
    list(query = intersects,
         params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8"),
         color = "red", active = TRUE)
  )
)

## For every release: how many coordinate keys ("start_stop") occur once,
## twice, three times, ...
coords_dupli <- all_tfbs %>%
  dplyr::group_by(version, coords) %>%
  dplyr::summarise(occurrences = n()) %>%
  group_by(version, occurrences) %>%
  summarise(coords_number = n()) %>%
  # Ungroup before factor(): built per release, the combined levels would be
  # ordered by first appearance instead of numerically.
  dplyr::ungroup() %>%
  dplyr::mutate(occurrences = factor(occurrences))

## All occurrence classes. The axis label and title now describe coordinates;
## they previously still read "TFBS ID" from the plot this was copied from.
dodge <- ggplot(coords_dupli, aes(fill = occurrences, y = coords_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14)) +
  labs(x = "", y = "Number of unique coordinates",
       title = "Coordinate duplication in BindingSiteSet.txt across versions")

## Same plot restricted to coordinates occurring 5 times or more.
dodge2 <- ggplot(coords_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
                 aes(fill = occurrences, y = coords_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "none") +
  labs(x = "", y = "Number of unique coordinates", title = "...minimum 5 copies")

## Stack the two plots vertically.
dodge / dodge2

Which coordinates are most duplicated?

## Coordinate keys with 10 or more rows in at least one release, shown wide:
## one row per key, one column per release holding the row count.
coords_dupli_max <- all_tfbs %>%
  dplyr::group_by(version, coords) %>%
  dplyr::summarise(occurrences = n()) %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(desc(occurrences)) %>%
  pivot_wider(names_from = version, values_from = occurrences)

DT::datatable(
  coords_dupli_max,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Distance to TSS

## Ridge-line densities of distance_TSS, one ridge per release.
## The x axis is limited to [-1000, 1000].
ggplot(all_tfbs, aes(x = distance_TSS, y = version, fill = version)) +
  ggridges::geom_density_ridges(color = "white") +
  ggridges::theme_ridges() +
  theme(legend.position = "none") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  xlim(-1000, 1000)

Distance to first gene

## Ridge-line densities of distance_gene, one ridge per release.
## The x axis is limited to [-1000, 1000].
ggplot(all_tfbs, aes(x = distance_gene, y = version, fill = version)) +
  ggridges::geom_density_ridges(color = "white") +
  ggridges::theme_ridges() +
  theme(legend.position = "none") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  xlim(-1000, 1000)

Effect

## Number of TFBS rows per release and regulatory effect (long format).
tfbs_effect_long <- all_tfbs %>%
  group_by(version, effect) %>%
  summarise(value = n())

## Layers shared by the two bar charts: fill scale, base theme, text sizes.
effect_bar_look <- list(
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE),
  theme_minimal(),
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14))
)

## Side-by-side bars, legend on the right.
dodge <- ggplot(tfbs_effect_long, aes(fill = effect, y = value, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  effect_bar_look +
  theme(legend.position = "right") +
  labs(x = "", y = "Number of TFBS", title = "")

## Stacked bars, no legend.
stack <- ggplot(tfbs_effect_long, aes(fill = effect, y = value, x = version)) +
  geom_bar(position = "stack", stat = "identity") +
  effect_bar_look +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of TFBS", title = "")

## One line per effect across releases.
line <- ggplot(tfbs_effect_long, aes(group = effect, y = value, x = version)) +
  geom_line(aes(color = effect)) +
  geom_point(size = 2, aes(color = effect)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "right") +
  labs(x = "Release version", y = "Number of TFBS", title = "")

## Interactive versions of the three plots, stacked in one column.
# fig0 <- ggplotly(TFBS_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)
## Effect counts as a wide table: one row per effect, one column per release.
effect_counts <- all_tfbs %>%
  group_by(version, effect) %>%
  summarise(value = n()) %>%
  data.frame()

effect_summary <- effect_counts %>%
  pivot_wider(names_from = version, values_from = value) %>%
  arrange(effect)

DT::datatable(
  effect_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Evidence

## Number of evidence items per release, evidence code and evidence name.
tfbs_evidence_long <- all_tfbs_by_evidence %>%
  group_by(version, evidence_code, evidence_name) %>%
  summarise(value = n())

## One colour per distinct evidence code (random_palette is a project helper
## defined elsewhere).
evidence_palette <- random_palette(length(unique(all_tfbs_by_evidence$evidence_code)))

## Side-by-side bars: filled by evidence code, grouped by evidence name.
dodge <- ggplot(
  tfbs_evidence_long,
  aes(fill = evidence_code, y = value, x = version, group = evidence_name)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  # scale_fill_viridis(discrete = T, na.value = "gray", drop = F) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "right") +
  labs(x = "", y = "Number of TFBSs", title = "")

## Stacked bars, no legend.
stack <- ggplot(
  tfbs_evidence_long,
  aes(fill = evidence_code, y = value, x = version, group = evidence_name)
) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  # scale_fill_viridis(discrete = T, na.value = "gray", drop = F) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "none") +
  labs(x = "", y = "Number of evidence", title = "")

## One line per evidence name, coloured by evidence code, no legend.
line <- ggplot(tfbs_evidence_long, aes(group = evidence_name, y = value, x = version)) +
  geom_line(aes(color = evidence_code)) +
  scale_color_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  geom_point(size = 2, aes(color = evidence_code)) +
  # scale_fill_viridis(discrete = T, na.value = "gray", drop = F) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "none") +
  labs(x = "Release version", y = "Number of TFBS", title = "")

## Interactive versions, with the overall TFBS count plot on top.
fig0 <- ggplotly(TFBS_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4)
## For every (evidence code, evidence name) pair, the releases in which it is
## found, collapsed by concat_uniq (project helper defined elsewhere).
tfbs_evidence_table <- all_tfbs_by_evidence %>%
  group_by(evidence_code, evidence_name) %>%
  summarise(version = concat_uniq(version))

DT::datatable(
  tfbs_evidence_table,
  rownames = FALSE,
  options = list(searching = TRUE, lengthChange = FALSE, pageLength = 10)
)

## Unique evidence codes of every release, named by version tag.
tfbs_evidence_shared <- setNames(
  lapply(tfbs_versions, function(tag) {
    unique((all_tfbs_by_evidence %>% dplyr::filter(version == tag))$evidence_code)
  }),
  tfbs_versions
)

UpSetR::upset(
  fromList(tfbs_evidence_shared),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2)
  # queries = list(list(query = intersects, params = list("v10.7"), color = "red", active = T),
  #                list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T))
)

## TODO: check that the table and the UpSet plot agree (the table lists CE as
## present only in 10.8; it should be listed for 10.7 too).

Confidence

## Number of TFBS rows per release and confidence level (long format).
tfbs_confidence_long <- all_tfbs %>%
  group_by(version, confidence) %>%
  summarise(value = n())

## Layers shared by the two bar charts: fill scale, base theme, text sizes.
confidence_bar_look <- list(
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE),
  theme_minimal(),
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14))
)

## Side-by-side bars, legend on the right.
dodge <- ggplot(tfbs_confidence_long, aes(fill = confidence, y = value, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  confidence_bar_look +
  theme(legend.position = "right") +
  labs(x = "", y = "Number of TFBS", title = "")

## Stacked bars, no legend.
stack <- ggplot(tfbs_confidence_long, aes(fill = confidence, y = value, x = version)) +
  geom_bar(position = "stack", stat = "identity") +
  confidence_bar_look +
  theme(legend.position = "none") +
  labs(x = "", y = "Number of TFBS", title = "")

## One line per confidence level across releases.
line <- ggplot(tfbs_confidence_long, aes(group = confidence, y = value, x = version)) +
  geom_line(aes(color = confidence)) +
  geom_point(size = 2, aes(color = confidence)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "right") +
  labs(x = "Release version", y = "Number of TFBS", title = "")

## Interactive versions of the three plots, stacked in one column.
# fig0 <- ggplotly(TFBS_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)
## Confidence counts as a wide table (one row per confidence level, one column
## per release). Missing confidence is labelled "null", absent combinations
## count as 0, and a final "total" row sums every release column.
confidence_counts <- all_tfbs %>%
  group_by(version, confidence) %>%
  summarise(value = n()) %>%
  mutate(confidence = ifelse(is.na(confidence), "null", as.character(confidence))) %>%
  data.frame()

confidence_summary <- confidence_counts %>%
  pivot_wider(names_from = version, values_from = value) %>%
  mutate(across(starts_with("v"), ~replace_na(., 0))) %>%
  bind_rows(
    summarise(
      .,
      across(where(is.numeric), sum),
      across(where(is.character), ~"total")
    )
  ) %>%
  mutate(
    confidence = factor(confidence,
                        levels = c("weak", "strong", "confirmed", "null", "total"))
  ) %>%
  arrange(confidence)

DT::datatable(
  confidence_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

RI IDs

## Unique RI IDs of every release, named by version tag.
ri_ids <- setNames(
  lapply(tfbs_versions, function(tag) unique(get(paste0("tfbs_set_", tag))$RI_ID)),
  tfbs_versions
)

## UpSet plot of the RI ID sets; two intersections are highlighted in blue and
## two in red.
UpSetR::upset(
  fromList(ri_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  queries = Map(
    function(members, colour) {
      list(query = intersects, params = as.list(members), color = colour, active = TRUE)
    },
    list(
      c("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
      c("v11.0", "v11.0.1", "v11.0.2"),
      c("v10.7", "v10.8", "v10.9", "v10.10"),
      c("v10.7", "v10.8")
    ),
    c("blue", "blue", "red", "red")
  )
)

Duplicate RI IDs

## For every release: how many RI IDs occur once, twice, three times, ...
ris_ids_dupli <- all_tfbs %>%
  dplyr::group_by(version, RI_ID) %>%
  dplyr::summarise(occurrences = n()) %>%
  group_by(version, occurrences) %>%
  summarise(ris_number = n()) %>%
  # Ungroup before factor(): built per release, the combined levels would be
  # ordered by first appearance instead of numerically.
  dplyr::ungroup() %>%
  dplyr::mutate(occurrences = factor(occurrences))

## All occurrence classes, bars side by side within each release.
dodge <- ggplot(ris_ids_dupli, aes(fill = occurrences, y = ris_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14)) +
  labs(x = "", y = "Number of unique RI IDs",
       title = "RI ID duplication in BindingSiteSet.txt across versions")

## Same plot restricted to RI IDs occurring at least twice.
dodge2 <- ggplot(ris_ids_dupli %>% dplyr::filter(!occurrences %in% c("1")),
                 aes(fill = occurrences, y = ris_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "none") +
  labs(x = "", y = "Number of unique RI IDs", title = "...minimum 2 copies")

## Stack the two plots vertically.
dodge / dodge2

TFs

## Unique TF IDs of every release, named by version tag.
tf_ids <- list()
for (v in tfbs_versions) {
  tf_ids[[v]] <- unique(get(paste0("tfbs_set_", v))$TF_ID)
}
UpSetR::upset(
  fromList(tf_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TF IDs",
  queries = list(
    list(query = intersects,
         params = list("v10.8", "v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.7", "v10.8", "v10.9"),
         color = "red", active = TRUE),
    list(query = intersects,
         params = list("v10.7"),
         color = "red", active = TRUE)
  )
)

## TF IDs present in v10.7 but absent from v11.0.2. Computed directly on the
## ID sets: reading them from the row names of the fromList() membership table
## only works if fromList() stores the elements as row names and the filter
## step preserves them, neither of which is guaranteed.
tf_ids_gone <- setdiff(tf_ids[["v10.7"]], tf_ids[["v11.0.2"]])

## Same comparison on TF names.
tf_names <- list()
for (v in tfbs_versions) {
  tf_names[[v]] <- unique(get(paste0("tfbs_set_", v))$TF_name)
}
UpSetR::upset(
  fromList(tf_names),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TF names",
  queries = list(
    list(query = intersects,
         params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.7", "v10.8", "v10.9"),
         color = "red", active = TRUE)
  )
)

## TF names present in v10.7 but absent from v11.0.2.
tf_names_gone <- setdiff(tf_names[["v10.7"]], tf_names[["v11.0.2"]])
  • 5 TF IDs disappear: ECK125257186, ECK120048948, ECK125257190, ECK120023539, ECK125257191 (red)
  • 2 TF names disappear: HigBA, YiaJ (red)

Promoters

## Unique promoter names of every release, named by version tag.
promoter_name <- setNames(
  lapply(tfbs_versions, function(tag) unique(get(paste0("tfbs_set_", tag))$promoter)),
  tfbs_versions
)

## UpSet plot of the promoter-name sets with three highlighted intersections.
UpSetR::upset(
  fromList(promoter_name),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique promoter names",
  queries = list(
    list(query = intersects,
         params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.7", "v10.8", "v10.9"),
         color = "red", active = TRUE)
  )
)

TUs

## Unique TU IDs of every release, named by version tag.
tu_ids <- setNames(
  lapply(tfbs_versions, function(tag) unique(get(paste0("tfbs_set_", tag))$TU_ID)),
  tfbs_versions
)
UpSetR::upset(
  fromList(tu_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TU IDs",
  queries = list(
    list(query = intersects,
         params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.7", "v10.8", "v10.9"),
         color = "red", active = TRUE)
  )
)

## Same comparison on TU names.
tu_names <- setNames(
  lapply(tfbs_versions, function(tag) unique(get(paste0("tfbs_set_", tag))$TU_name)),
  tfbs_versions
)
UpSetR::upset(
  fromList(tu_names),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TU names",
  queries = list(
    list(query = intersects,
         params = list("v10.8", "v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"),
         color = "blue", active = TRUE),
    list(query = intersects,
         params = list("v10.7", "v10.8", "v10.9"),
         color = "red", active = TRUE)
  )
)

Mapping v10.7 vs v11.0.2

Notes:

  • TFBSs from v10.7 and v11.0.2 are mapped by TFBS_ID and promoter name
    • TFBS_IDs are not unique, eg. a given TFBS_ID can be associated with several promoters and TUs
    • Some RI_IDs seem to have changed, all other parameters equal (ex. seems like Fis TFBS_ID ECK120011288 was associated with RI_ID ECK120034675, now ECK125300661)
    • Older versions of BindingSiteSet.txt did not come with a TU_ID column, and TU_name is not unique, and is sometimes missing

The following table displays the differences. Many (most?) differences seem to be caused by different promoter names

Ex.

  • in 10.7 micFp is regulated by Lrp (TFBS ID ECK120011644)
  • in 11.0.2 micFp1 and micFp2 are both regulated by the same TF and TFBS, and are linked to 2 distinct TUs (both TUs are named micF but have distinct IDs)
  • all three entries have the same evidence
## Join versions 10.7 and 11.0.2 by TFBS_ID and promoter name
## Full join: rows found in only one release are kept, with NA in the columns
## coming from the other release. Columns present in both tables (other than
## the join keys) get the suffix "_10.7" or "_11.0.2". The select() keeps the
## keys plus the TF name, TU, coords, evidence and confidence columns;
## starts_with("evidence_1") picks up the suffixed evidence columns, since both
## suffixes begin with "1".
tfbs_join_107_1102 <- tfbs_set_v10.7 %>%
  dplyr::full_join(tfbs_set_v11.0.2, by = c("TFBS_ID", "promoter"), suffix = c("_10.7", "_11.0.2")) %>%
  dplyr::arrange(TFBS_ID) %>%
  dplyr::select(TFBS_ID, promoter, starts_with("TF_name"), starts_with("TU"), starts_with("coords"), starts_with("evidence_1"), starts_with("confidence"))
  # select(TFBS_ID, promoter, everything())


## "Matches" are the joined rows without any NA in the selected columns.
## NOTE(review): a row also drops out of the matches when one of its own
## fields is missing in a release, not only when it is absent from that
## release -- confirm this is intended.
tfbs_matches_107_1102 <- tfbs_join_107_1102 %>% na.omit

## "Differences" are the joined rows that are not in the matches.
tfbs_differences_107_1102 <- dplyr::setdiff(tfbs_join_107_1102, tfbs_matches_107_1102)

## Save the full join and the differences as tab-separated files in dir_results.
write.table(tfbs_join_107_1102, file = paste0(dir_results, "/TFBS_full_join_107_1102.tsv"), quote = F, row.names = F, col.names = T, sep = "\t")
write.table(tfbs_differences_107_1102, file = paste0(dir_results, "/TFBS_differences_107_1102.tsv"), quote = F, row.names = F, col.names = T, sep = "\t")

## Searchable table of the differences; columns 11 and 12 get a fixed width
## (presumably the two evidence columns -- TODO confirm the index base).
DT::datatable(tfbs_differences_107_1102, rownames= FALSE, options = list(searching = TRUE, lengthChange = FALSE, pageLength = 5,
                                                                                columnDefs = list(list(width = '200px', targets = c(11,12)))
                                                                                ))

network_tf_gene.txt

## Releases of network_tf_gene to load, oldest first.
dir_versions <- c("10.6.3", "10.7", "10.8", "10.9", "11.0", "11.0.1")
tfgnw_sets <- list()
tfgnw_versions <- c()

## Read every release into a common layout. Each table is stored as the
## variable tfgnw_set_<tag> and as tfgnw_sets[[<tag>]]; the tags are collected
## in tfgnw_versions in load order.
for (v in dir_versions) {
  version_tag <- paste0("v", v)
  set <- read.delim(
    paste0(dir_releases, "/", v, "/network_tf_gene.tsv"),
    comment.char = "#",
    header = TRUE,
    stringsAsFactors = FALSE,
    na.strings = c("", "NA")
  ) %>%
    dplyr::mutate(
      # Words become symbols; any other value is kept unchanged.
      effect = ifelse(
        effect == "repressor", "-",
        ifelse(effect == "activator", "+",
               ifelse(effect == "unknown", "?", effect))
      ),
      version = version_tag
    ) %>%
    # Row-wise so that the evidence merge is evaluated one row at a time; the
    # table is left row-wise afterwards.
    dplyr::rowwise() %>%
    dplyr::mutate(
      # Merge the two evidence columns when the release has evidence_function
      # (concat_uniq2 is a project helper defined elsewhere).
      evidence = ifelse("evidence_function" %in% colnames(.),
                        concat_uniq2(evidence, evidence_function),
                        evidence),
      # Key identifying a TF-gene interaction.
      pairs = paste0(TF_name, "_", gene_name)
    )

  assign(paste0("tfgnw_set_", version_tag), set)
  tfgnw_sets[[version_tag]] <- set
  tfgnw_versions <- c(tfgnw_versions, version_tag)
}
## Stack all releases and fix the ordering of the categorical columns.
## Confidence is lower-cased before being turned into a factor; values outside
## the listed levels become NA.
all_tfgnw <- dplyr::mutate(
  bind_rows(tfgnw_sets),
  version = factor(version, levels = tfgnw_versions),
  effect = factor(effect, levels = c("+", "-", "?")),
  confidence = factor(tolower(confidence), levels = c("weak", "strong", "confirmed"))
)

## One row per (TF-gene entry, evidence item). The evidence column is split on
## "," into separate rows, then square brackets and surrounding blanks are
## removed. Items containing "|" are split into code / level / name (1st, 2nd
## and 3rd piece); items without "|" are kept whole as the code, with NA level
## and name. The original evidence column is kept, and the result stays
## row-wise.
all_tfgnw_by_evidence <- all_tfgnw %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence = trimws(gsub("\\[|\\]", "", evidence))) %>% 
  # Placeholder columns; all three are overwritten by the row-wise mutate below.
  dplyr::mutate(evidence_code = "",
                evidence_level = "",
                evidence_name = "") %>%
  # Row-wise so that ifelse() and str_split() work on one evidence string at a time.
  dplyr::rowwise() %>%
  dplyr::mutate(evidence_code = ifelse(grepl("\\|", evidence), stringr::str_split(evidence, "\\|")[[1]][1], evidence),
                evidence_level = ifelse(grepl("\\|", evidence), stringr::str_split(evidence, "\\|")[[1]][2], NA),
                evidence_name = ifelse(grepl("\\|", evidence), stringr::str_split(evidence, "\\|")[[1]][3], NA)
                )

NB: additional TAB characters at the end of each line cause parsing issues

Releases

Comparison of the network_tf_gene.txt file between the last releases.

Release version Date
10.6 (NA) 2019 July
10.6.3 -
10.7 2020 April
10.8 2020 October
10.9 2021 April
10.10 (NA) 2022 February
11.0 2022 August
11.0.1 (not public) 2022 September
## Number of network_tf_gene rows in each release, ordered by release.
tfgnw_per_version <- dplyr::group_by(all_tfgnw, version)
tfgnw_summary <- dplyr::arrange(
  dplyr::summarise(tfgnw_per_version, total = n()),
  version
)

## Bar chart of those totals (simple_bar is a project helper defined elsewhere).
tfgnw_num <- simple_bar(tfgnw_summary, "version", "total") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of TF-gene entries", title = "")

tfgnw_num

## Same totals as a static table.
DT::datatable(
  tfgnw_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

TF-gene pairs

## Unique "TF_gene" pair keys of every release, named by version tag.
pairs <- setNames(
  lapply(tfgnw_versions, function(tag) unique(get(paste0("tfgnw_set_", tag))$pairs)),
  tfgnw_versions
)

## UpSet plot of the pair sets (no highlighted intersections).
UpSetR::upset(
  fromList(pairs),
  sets = tfgnw_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2)
  # queries = list(
  #                list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T),
  #                list(query = intersects, params = list("v10.7", "v10.8"), color = "red", active = T)
  #                )
)

## For every release: how many TF-gene pairs occur once, twice, three times, ...
## NOTE(review): the result is stored in coords_dupli, which overwrites the
## coordinate table of the BindingSiteSet section. The name is kept in case
## later chunks rely on it; a name such as pairs_dupli would be clearer.
coords_dupli <- all_tfgnw %>%
  dplyr::group_by(version, pairs) %>%
  dplyr::summarise(occurrences = n()) %>%
  group_by(version, occurrences) %>%
  summarise(pairs_number = n()) %>%
  # Ungroup before factor(): built per release, the combined levels would be
  # ordered by first appearance instead of numerically.
  dplyr::ungroup() %>%
  dplyr::mutate(occurrences = factor(occurrences))

## All occurrence classes, bars side by side within each release.
dodge <- ggplot(coords_dupli, aes(fill = occurrences, y = pairs_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14)) +
  labs(x = "", y = "Number of unique TF-gene pairs",
       title = "TF-gene pairs duplication in network_tf_gene.txt across versions")

## Same plot restricted to pairs occurring 5 times or more.
dodge2 <- ggplot(coords_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
                 aes(fill = occurrences, y = pairs_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(axis.text = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        title = element_text(size = 14),
        legend.position = "none") +
  labs(x = "", y = "Number of unique TF-gene pairs", title = "...minimum 5 copies")

## Stack the two plots vertically.
dodge / dodge2

Which pairs are most duplicated?

## TF-gene pairs with 10 or more rows in at least one release, shown wide:
## one row per pair, one column per release holding the row count.
pairs_dupli_max <- all_tfgnw %>%
  dplyr::group_by(version, pairs) %>%
  dplyr::summarise(occurrences = n()) %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(desc(occurrences)) %>%
  pivot_wider(names_from = version, values_from = occurrences)

DT::datatable(
  pairs_dupli_max,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

TFs

## Unique TF names of every network_tf_gene release, named by version tag.
tf_names <- setNames(
  lapply(tfgnw_versions, function(tag) unique(get(paste0("tfgnw_set_", tag))$TF_name)),
  tfgnw_versions
)

UpSetR::upset(
  fromList(tf_names),
  sets = tfgnw_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TF names"
  # queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))
)

Genes

## Unique gene names of every network_tf_gene release, named by version tag.
gene_names <- list()
for (v in tfgnw_versions) {
  gene_names[[v]] <- unique(get(paste0("tfgnw_set_", v))$gene_name)
}

## UpSet plot of the gene-name sets. The set-size axis is labelled as gene
## names; it previously still read "TF names" from the TF plot.
UpSetR::upset(
  fromList(gene_names),
  sets = tfgnw_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique gene names"
  # queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))
)

Effect

## Number of network_tf_gene entries per release and regulatory effect.
tfgnw_effect_long <- all_tfgnw %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = dplyr::n())

## Side-by-side bars: one bar per effect within each release.
dodge <- ggplot(
  tfgnw_effect_long,
  aes(x = version, y = value, fill = effect)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tfgnw", title = "")

## Stacked bars: effects piled up within each release (no legend).
stack <- ggplot(
  tfgnw_effect_long,
  aes(x = version, y = value, fill = effect)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of tfgnw", title = "")

## Line plot: one line per effect across releases.
line <- ggplot(
  tfgnw_effect_long,
  aes(x = version, y = value, group = effect)
) +
  geom_line(aes(color = effect)) +
  geom_point(aes(color = effect), size = 2) +
  scale_color_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "Release version", y = "Number of tfgnw", title = "")

## Convert the three plots to interactive widgets and show them in one column.
# fig0 <- ggplotly(tfgnw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)
## Wide table of entry counts: one row per effect, one column per release.
effect_summary <- all_tfgnw %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = dplyr::n()) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = value) %>%
  dplyr::arrange(effect)

DT::datatable(
  effect_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Evidence

## Number of entries per release and evidence code.
## Note: only release 11.0.1 provides evidence names.
tfgnw_evidence_long <- all_tfgnw_by_evidence %>%
  dplyr::group_by(version, evidence_code) %>%
  dplyr::summarise(
    value = dplyr::n(),
    evidence_name = concat_uniq(evidence_name)
  )

## One palette entry per distinct evidence code.
evidence_palette <- random_palette(
  length(unique(all_tfgnw_by_evidence[["evidence_code"]]))
)

## Side-by-side bars: one bar per evidence code within each release.
## A manual palette is used here instead of scale_fill_viridis().
dodge <- ggplot(
  tfgnw_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tfgnws", title = "")

## Stacked bars: evidence codes piled up within each release (no legend).
stack <- ggplot(
  tfgnw_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of evidence", title = "")

## Line plot: one line per evidence code across releases (no legend).
line <- ggplot(
  tfgnw_evidence_long,
  aes(x = version, y = value, group = evidence_code)
) +
  geom_line(aes(color = evidence_code)) +
  scale_color_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  geom_point(aes(color = evidence_code), size = 2) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "Release version", y = "Number of tfgnw", title = "")

## Interactive versions, shown in one column under the per-release totals.
fig0 <- ggplotly(tfgnw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4)
## Per evidence code: total number of entries, plus the evidence names and
## releases collapsed by concat_uniq().
## Note: only release 11.0.1 provides evidence names.
tfgnw_evidence_table <- all_tfgnw_by_evidence %>%
  dplyr::group_by(evidence_code) %>%
  dplyr::summarise(
    value = dplyr::n(),
    evidence_name = concat_uniq(evidence_name),
    version = concat_uniq(version)
  )

DT::datatable(
  tfgnw_evidence_table,
  rownames = FALSE,
  options = list(searching = TRUE, lengthChange = FALSE, pageLength = 10)
)
## TODO: remove the empty-string evidence code from the evidence table
## Distinct evidence codes found in each release, keyed by version tag.
tfgnw_evidence_shared <- list()
for (v in tfgnw_versions) {
  tfgnw_evidence_shared[[v]] <- unique(
    dplyr::filter(all_tfgnw_by_evidence, version == v)[["evidence_code"]]
  )
}

## UpSet plot of the evidence codes shared between releases.
UpSetR::upset(
  UpSetR::fromList(tfgnw_evidence_shared),
  sets = tfgnw_versions,
  keep.order = TRUE,
  order.by = "freq",
  text.scale = c(2, 2, 2, 2, 2, 2)
  # queries = list(list(query = intersects, params = list("v10.7"), color = "red", active = T),
  #                list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T))
)

## TODO: check that the table and the UpSet plot agree (the table lists CE as
## present only in 10.8; it should list 10.7 too)

Confidence

## Number of network_tf_gene entries per release and confidence level.
tfgnw_confidence_long <- all_tfgnw %>%
  dplyr::group_by(version, confidence) %>%
  dplyr::summarise(value = dplyr::n())

## Side-by-side bars: one bar per confidence level within each release.
dodge <- ggplot(
  tfgnw_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tfgnw", title = "")

## Stacked bars: confidence levels piled up within each release (no legend).
stack <- ggplot(
  tfgnw_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of tfgnw", title = "")

## Line plot: one line per confidence level across releases.
line <- ggplot(
  tfgnw_confidence_long,
  aes(x = version, y = value, group = confidence)
) +
  geom_line(aes(color = confidence)) +
  geom_point(aes(color = confidence), size = 2) +
  scale_color_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "Release version", y = "Number of tfgnw", title = "")

## Convert the three plots to interactive widgets and show them in one column.
# fig0 <- ggplotly(tfgnw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)
## Wide table of entry counts: one row per confidence level, one column per
## release. Missing confidence is shown as "null", absent combinations as 0,
## and a final "total" row holds the column sums.
confidence_summary <- all_tfgnw %>%
  dplyr::group_by(version, confidence) %>%
  dplyr::summarise(value = dplyr::n()) %>%
  dplyr::mutate(
    confidence = dplyr::coalesce(as.character(confidence), "null")
  ) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = value) %>%
  # release columns all start with "v" (version tags)
  dplyr::mutate(
    dplyr::across(dplyr::starts_with("v"), function(x) replace_na(x, 0))
  ) %>%
  # append the totals row: numeric columns are summed, the label becomes "total"
  dplyr::bind_rows(
    dplyr::summarise(
      .,
      dplyr::across(where(is.numeric), sum),
      dplyr::across(where(is.character), function(x) "total")
    )
  ) %>%
  dplyr::mutate(
    confidence = factor(
      confidence,
      levels = c("weak", "strong", "confirmed", "null", "total")
    )
  ) %>%
  dplyr::arrange(confidence)

DT::datatable(
  confidence_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

network_tf_tu.txt

## Load network_tf_tu from every release directory listed below.
## Each table is stored three ways: as `tftunw_set_<version tag>`, as an
## element of `tftunw_sets`, and its tag is appended to `tftunw_versions`.
dir_versions <- c("10.6.3", "10.7", "10.8", "10.9", "11.0", "11.0.1")
tftunw_sets <- list()
tftunw_versions <- c()

for (v in dir_versions) {
  version_tag <- paste0("v", v)

  set <- read.delim(
    paste0(dir_releases, "/", v, "/network_tf_tu.tsv"),
    header = TRUE,
    comment.char = "#",
    stringsAsFactors = FALSE,
    na.strings = c("", "NA")
  ) %>%
    dplyr::mutate(
      # harmonise the effect column: words become symbols, anything else is kept
      effect = ifelse(effect == "repressor", "-", effect),
      effect = ifelse(effect == "activator", "+", effect),
      effect = ifelse(effect == "unknown", "?", effect),
      version = version_tag
    ) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      # when the release has a separate evidence_function column, combine it
      # with evidence through concat_uniq2() (helper defined elsewhere)
      evidence = ifelse(
        "evidence_function" %in% colnames(.),
        concat_uniq2(evidence, evidence_function),
        evidence
      ),
      # pair identifier: TF name and TU name joined by "_"
      pairs = paste0(TF_name, "_", TU_name)
    )

  assign(paste0("tftunw_set_", version_tag), set)
  tftunw_sets[[version_tag]] <- set
  tftunw_versions <- c(tftunw_versions, version_tag)
}
## All releases in one table. Version, effect and confidence become factors
## with a fixed level order; confidence is lower-cased first.
all_tftunw <- bind_rows(tftunw_sets) %>%
  dplyr::mutate(
    version = factor(version, levels = tftunw_versions),
    effect = factor(effect, levels = c("+", "-", "?")),
    confidence = factor(
      tolower(confidence),
      levels = c("weak", "strong", "confirmed")
    )
  )

## One row per entry and evidence item.
## The evidence column is split on commas, square brackets are removed and
## whitespace is trimmed. An item containing "|" is split into its first three
## fields (code, level, name); an item without "|" is used as the code and
## gets NA for level and name.
all_tftunw_by_evidence <- all_tftunw %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(
    evidence = trimws(gsub("\\[|\\]", "", evidence)),
    evidence_code = "",
    evidence_level = "",
    evidence_name = ""
  ) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(
    evidence_code = ifelse(
      grepl("\\|", evidence),
      stringr::str_split(evidence, "\\|")[[1]][1],
      evidence
    ),
    evidence_level = ifelse(
      grepl("\\|", evidence),
      stringr::str_split(evidence, "\\|")[[1]][2],
      NA
    ),
    evidence_name = ifelse(
      grepl("\\|", evidence),
      stringr::str_split(evidence, "\\|")[[1]][3],
      NA
    )
  )

NB: additional TAB characters at the end of each line cause parsing issues

Releases

Comparison of the network_tf_tu.txt file between the last releases.

Release version Date
10.6 (NA) 2019 July
10.6.3 -
10.7 2020 April
10.8 2020 October
10.9 2021 April
10.10 (NA) 2022 February
11.0 2022 August
11.0.1 (not public) 2022 September
## Total number of network_tf_tu entries per release.
tftunw_summary <- all_tftunw %>%
  dplyr::group_by(version) %>%
  dplyr::summarise(total = dplyr::n()) %>%
  dplyr::arrange(version)

## Bar chart of the totals, built with simple_bar() (helper defined elsewhere).
## The y axis is labelled as TF-TU entries: this table comes from
## network_tf_tu, and the previous "TF-gene" label was copied from the
## network_tf_gene section.
tftunw_num <- simple_bar(tftunw_summary, "version", "total") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of TF-TU entries", title = "")

tftunw_num

DT::datatable(
  tftunw_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

TF-TU pairs

## Unique TF_TU pair identifiers per release, keyed by version tag.
pairs <- list()
for (v in tftunw_versions) {
  pairs[[v]] <- unique(get(paste0("tftunw_set_", v))[["pairs"]])
}

## UpSet plot of the pairs shared between releases.
UpSetR::upset(
  UpSetR::fromList(pairs),
  sets = tftunw_versions,
  keep.order = TRUE,
  order.by = "freq",
  text.scale = c(2, 2, 2, 2, 2, 2)
  # queries = list(
  #                list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T),
  #                list(query = intersects, params = list("v10.7", "v10.8"), color = "red", active = T)
  #                )
)

## Duplication profile of TF-TU pairs: for every release, how many distinct
## pairs occur once, twice, three times, and so on.
coords_dupli <- all_tftunw %>%
  dplyr::group_by(version, pairs) %>%
  dplyr::summarise(occurrences = dplyr::n()) %>%
  dplyr::group_by(version, occurrences) %>%
  dplyr::summarise(pairs_number = dplyr::n()) %>%
  dplyr::mutate(occurrences = factor(occurrences))

## Dodged bars: one bar per occurrence count within each release.
## Axis label and title refer to TF-TU pairs and network_tf_tu.txt: the
## previous strings were copied from the network_tf_gene section and named
## the wrong file and pair type.
dodge <- ggplot(
  coords_dupli,
  aes(x = version, y = pairs_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(
    x = "",
    y = "Number of unique TF-TU pairs",
    title = "TF-TU pairs duplication in network_tf_tu.txt across versions"
  )

dodge

TFs

## Unique TF names per release, keyed by version tag, read from the
## per-release `tftunw_set_<version>` objects.
tf_names <- list()
for (v in tftunw_versions) {
  tf_names[[v]] <- unique(get(paste0("tftunw_set_", v))[["TF_name"]])
}

## UpSet plot of the TF names shared between releases.
UpSetR::upset(
  UpSetR::fromList(tf_names),
  sets = tftunw_versions,
  keep.order = TRUE,
  order.by = "freq",
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TF names"
  # queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))
)

Transcription units (TUs)

## Unique TU names per release, keyed by version tag, read from the
## per-release `tftunw_set_<version>` objects.
TU_names <- list()
for (v in tftunw_versions) {
  TU_names[[v]] <- unique(get(paste0("tftunw_set_", v))[["TU_name"]])
}

## UpSet plot of the TU names shared between releases.
UpSetR::upset(
  UpSetR::fromList(TU_names),
  sets = tftunw_versions,
  keep.order = TRUE,
  order.by = "freq",
  text.scale = c(2, 2, 2, 2, 2, 2),
  sets.x.label = "Number of unique TU names"
  # queries = list(list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = T))
)

Effect

## Number of network_tf_tu entries per release and regulatory effect.
tftunw_effect_long <- all_tftunw %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = dplyr::n())

## Side-by-side bars: one bar per effect within each release.
dodge <- ggplot(
  tftunw_effect_long,
  aes(x = version, y = value, fill = effect)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Stacked bars: effects piled up within each release (no legend).
stack <- ggplot(
  tftunw_effect_long,
  aes(x = version, y = value, fill = effect)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Line plot: one line per effect across releases.
line <- ggplot(
  tftunw_effect_long,
  aes(x = version, y = value, group = effect)
) +
  geom_line(aes(color = effect)) +
  geom_point(aes(color = effect), size = 2) +
  scale_color_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "Release version", y = "Number of tftunw", title = "")

## Convert the three plots to interactive widgets and show them in one column.
# fig0 <- ggplotly(tftunw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)
## Wide table of entry counts: one row per effect, one column per release.
effect_summary <- all_tftunw %>%
  dplyr::group_by(version, effect) %>%
  dplyr::summarise(value = dplyr::n()) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = value) %>%
  dplyr::arrange(effect)

DT::datatable(
  effect_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Evidence

## Number of entries per release and evidence code.
## Note: only release 11.0.1 provides evidence names.
tftunw_evidence_long <- all_tftunw_by_evidence %>%
  dplyr::group_by(version, evidence_code) %>%
  dplyr::summarise(
    value = dplyr::n(),
    evidence_name = concat_uniq(evidence_name)
  )

## One palette entry per distinct evidence code.
evidence_palette <- random_palette(
  length(unique(all_tftunw_by_evidence[["evidence_code"]]))
)

## Side-by-side bars: one bar per evidence code within each release.
## A manual palette is used here instead of scale_fill_viridis().
dodge <- ggplot(
  tftunw_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tftunws", title = "")

## Stacked bars: evidence codes piled up within each release (no legend).
stack <- ggplot(
  tftunw_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of evidence", title = "")

## Line plot: one line per evidence code across releases (no legend).
line <- ggplot(
  tftunw_evidence_long,
  aes(x = version, y = value, group = evidence_code)
) +
  geom_line(aes(color = evidence_code)) +
  scale_color_manual(values = evidence_palette, drop = FALSE, na.value = "gray") +
  geom_point(aes(color = evidence_code), size = 2) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "Release version", y = "Number of tftunw", title = "")

## Interactive versions, shown in one column under the per-release totals.
fig0 <- ggplotly(tftunw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4)
## Per evidence code: total number of entries, plus the evidence names and
## releases collapsed by concat_uniq().
## Note: only release 11.0.1 provides evidence names.
tftunw_evidence_table <- all_tftunw_by_evidence %>%
  dplyr::group_by(evidence_code) %>%
  dplyr::summarise(
    value = dplyr::n(),
    evidence_name = concat_uniq(evidence_name),
    version = concat_uniq(version)
  )

DT::datatable(
  tftunw_evidence_table,
  rownames = FALSE,
  options = list(searching = TRUE, lengthChange = FALSE, pageLength = 10)
)
## TODO: remove the empty-string evidence code from the evidence table
## Distinct evidence codes found in each release, keyed by version tag.
tftunw_evidence_shared <- list()
for (v in tftunw_versions) {
  tftunw_evidence_shared[[v]] <- unique(
    dplyr::filter(all_tftunw_by_evidence, version == v)[["evidence_code"]]
  )
}

## UpSet plot of the evidence codes shared between releases.
UpSetR::upset(
  UpSetR::fromList(tftunw_evidence_shared),
  sets = tftunw_versions,
  keep.order = TRUE,
  order.by = "freq",
  text.scale = c(2, 2, 2, 2, 2, 2)
  # queries = list(list(query = intersects, params = list("v10.7"), color = "red", active = T),
  #                list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = T))
)

## TODO: check that the table and the UpSet plot agree (the table lists CE as
## present only in 10.8; it should list 10.7 too)

Confidence

## Number of network_tf_tu entries per release and confidence level.
tftunw_confidence_long <- all_tftunw %>%
  dplyr::group_by(version, confidence) %>%
  dplyr::summarise(value = dplyr::n())

## Side-by-side bars: one bar per confidence level within each release.
dodge <- ggplot(
  tftunw_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Stacked bars: confidence levels piled up within each release (no legend).
stack <- ggplot(
  tftunw_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of tftunw", title = "")

## Line plot: one line per confidence level across releases.
line <- ggplot(
  tftunw_confidence_long,
  aes(x = version, y = value, group = confidence)
) +
  geom_line(aes(color = confidence)) +
  geom_point(aes(color = confidence), size = 2) +
  scale_color_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "Release version", y = "Number of tftunw", title = "")

## Convert the three plots to interactive widgets and show them in one column.
# fig0 <- ggplotly(tftunw_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)
## Wide table of entry counts: one row per confidence level, one column per
## release. Missing confidence is shown as "null", absent combinations as 0,
## and a final "total" row holds the column sums.
confidence_summary <- all_tftunw %>%
  dplyr::group_by(version, confidence) %>%
  dplyr::summarise(value = dplyr::n()) %>%
  dplyr::mutate(
    confidence = dplyr::coalesce(as.character(confidence), "null")
  ) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = value) %>%
  # release columns all start with "v" (version tags)
  dplyr::mutate(
    dplyr::across(dplyr::starts_with("v"), function(x) replace_na(x, 0))
  ) %>%
  # append the totals row: numeric columns are summed, the label becomes "total"
  dplyr::bind_rows(
    dplyr::summarise(
      .,
      dplyr::across(where(is.numeric), sum),
      dplyr::across(where(is.character), function(x) "total")
    )
  ) %>%
  dplyr::mutate(
    confidence = factor(
      confidence,
      levels = c("weak", "strong", "confirmed", "null", "total")
    )
  ) %>%
  dplyr::arrange(confidence)

DT::datatable(
  confidence_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

–>

Other

## Persist the whole workspace so the report objects can be reloaded later.
save.image(file = "Binding_dataset_report.Rdata")